# Copyright 2019 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains utilities for analyzers."""

import logging
import re
from six.moves import urllib_parse as urlparse

import numpy

from timesketch.lib.analyzers import interface


# Module-level logger for the analyzer utilities defined in this file.
logger = logging.getLogger("timesketch.analyzer_utils")

# Title and header text of the story that is shared among browser
# based analyzers.
BROWSER_STORY_TITLE = "Browser Artifacts"
BROWSER_STORY_HEADER = """
This is an automatically generated story that browser history
based analyzers contribute to. Each section in this story
is generated by a separate analyzer.
"""

# Title and header text of the story that is shared among Sigma
# based analyzers.
SIGMA_STORY_TITLE = "Sigma Artifacts"
SIGMA_STORY_HEADER = """
This is an automatically generated story that Sigma
based analyzers contribute to.
"""

# Mapping of domain suffix -> CDN provider name. get_cdn_provider() tests
# every key against the end of a domain with str.endswith().
#
# CDN domain list based on:
# https://github.com/WPO-Foundation/webpagetest/blob/master/agent/wpthook/cdn.h
# Last updated: 2019-01-11
#
# NOTE(review): ".googlesyndication." and ".yimg." end with a dot, so with
# suffix matching they only match a domain that itself ends with that dot.
# Presumably they were meant to match any TLD - confirm against the upstream
# list before changing them.
KNOWN_CDN_DOMAINS = {
    ".att-dsa.net": "AT&T",
    ".pix-cdn.org": "Advanced Hosters CDN",
    ".akamai.net": "Akamai",
    ".akamaiedge.net": "Akamai",
    ".akamaihd.net": "Akamai",
    ".akamaitechnologies.com": "Akamai",
    ".akamaitechnologies.fr": "Akamai",
    ".akamaized.net": "Akamai",
    ".edgekey.net": "Akamai",
    ".edgesuite.net": "Akamai",
    ".srip.net": "Akamai",
    ".tl88.net": "Akamai China CDN",
    ".gslb.tbcache.com": "Alimama",
    ".cloudfront.net": "Amazon CloudFront",
    ".aads-cn.net": "Aryaka",
    ".aads-cng.net": "Aryaka",
    ".aads1.net": "Aryaka",
    ".azion.net": "Azion",
    ".azioncdn.com": "Azion",
    ".azioncdn.net": "Azion",
    ".bo.lt": "BO.LT",
    ".bisongrid.net": "Bison Grid",
    ".bitgravity.com": "BitGravity",
    ".bluehatnetwork.com": "Blue Hat Network",
    ".b-cdn.net": "BunnyCDN",
    ".cdn77.net": "CDN77",
    ".cdn77.org": "CDN77",
    ".cdngc.net": "CDNetworks",
    ".gccdn.net": "CDNetworks",
    ".panthercdn.com": "CDNetworks",
    ".cdnsun.net": "CDNsun",
    ".cdnvideo.net": "CDNvideo",
    ".cdnvideo.ru": "CDNvideo",
    ".cachefly.net": "Cachefly",
    ".caspowa.com": "Caspowa",
    ".cedexis.net": "Cedexis",
    ".ccgslb.com": "ChinaCache",
    ".lxdns.com": "ChinaNetCenter",
    ".ourwebpic.com": "ChinaNetCenter",
    ".wscdns.com": "ChinaNetCenter",
    ".wscloudcdn.com": "ChinaNetCenter",
    ".cloudflare.com": "Cloudflare",
    ".cotcdn.net": "Cotendo CDN",
    ".systemcdn.net": "Edgecast",
    ".transactcdn.net": "Edgecast",
    ".v1cdn.net": "Edgecast",
    ".v2cdn.net": "Edgecast",
    ".v3cdn.net": "Edgecast",
    ".v4cdn.net": "Edgecast",
    ".v5cdn.net": "Edgecast",
    ".edgecastcdn.net": "Edgecast",
    ".cdninstagram.com": "Facebook",
    ".fbcdn.net": "Facebook",
    ".fastly.net": "Fastly",
    ".fastlylb.net": "Fastly",
    ".nocookie.net": "Fastly",
    ".cdn.gocache.net": "GoCache",
    ".doubleclick.net": "Google",
    ".googleusercontent.com": "Google",
    ".gstatic.com": "Google",
    ".googlehosted.com": "Google",
    ".googlesyndication.": "Google",
    ".hiberniacdn.com": "HiberniaCDN",
    ".hwcdn.net": "Highwinds",
    ".hosting4cdn.com": "Hosting4CDN",
    ".incapdns.net": "Incapsula",
    ".inscname.net": "Instart Logic",
    ".insnw.net": "Instart Logic",
    ".internapcdn.net": "Internap",
    ".kinxcdn.com": "KINX CDN",
    ".kinxcdn.net": "KINX CDN",
    ".kxcdn.com": "KeyCDN",
    ".lswcdn.eu": "LeaseWeb CDN",
    ".lswcdn.net": "LeaseWeb CDN",
    ".footprint.net": "Level 3",
    ".fpbns.net": "Level 3",
    ".llnwd.net": "Limelight",
    ".cdncloud.net.au": "MediaCloud",
    ".mncdn.com": "Medianova",
    ".mncdn.net": "Medianova",
    ".mncdn.org": "Medianova",
    ".azure.microsoft.com": "Microsoft Azure",
    ".azureedge.net": "Microsoft Azure",
    ".vo.msecnd.net": "Microsoft Azure",
    ".instacontent.net": "Mirror Image",
    ".mirror-image.net": "Mirror Image",
    ".ngenix.net": "NGENIX",
    ".nyiftw.com": "NYI FTW",
    ".nyiftw.net": "NYI FTW",
    ".netdna-cdn.com": "NetDNA",
    ".netdna-ssl.com": "NetDNA",
    ".netdna.com": "NetDNA",
    ".netlify.com": "Netlify",
    ".r.worldcdn.net": "OnApp",
    ".r.worldssl.net": "OnApp",
    ".optimalcdn.com": "Optimal CDN",
    ".pagerain.net": "PageRain",
    ".raxcdn.com": "Rackspace",
    ".resrc.it": "ReSRC.it",
    ".rlcdn.com": "Reapleaf",
    ".rncdn1.com": "Reflected Networks",
    ".rncdn7.com": "Reflected Networks",
    ".revcn.net": "Rev Software",
    ".revdn.net": "Rev Software",
    ".roast.io": "Roast.io",
    ".streamprovider.net": "Rocket CDN",
    ".cdn.sfr.net": "SFR",
    ".simplecdn.net": "Simple CDN",
    ".singularcdn.net.br": "Singular CDN",
    ".stackpathdns.com": "StackPath",
    ".swiftcdn1.com": "SwiftCDN",
    ".swiftserve.com": "SwiftCDN",
    ".trbcdn.ru": "TRBCDN",
    ".gslb.taobao.com": "Taobao",
    ".taobaocdn.com": "Taobao",
    ".tbcdn.cn": "Taobao",
    ".cdntel.net": "Telenor",
    ".twimg.com": "Twitter",
    ".unicorncdn.net": "UnicornCDN",
    ".voxcdn.net": "VoxCDN",
    ".gravatar.com": "WordPress",
    ".wordpress.com": "WordPress",
    ".wp.com": "WordPress",
    ".ay1.b.yahoo.com": "Yahoo",
    ".yahooapis.com": "Yahoo",
    ".yimg.": "Yahoo",
    ".yottaa.net": "Yottaa",
    ".zenedge.net": "Zenedge",
    ".afxcdn.net": "afxcdn.net",
    ".cubecdn.net": "cubeCDN",
    ".cdn.jsdelivr.net": "jsDelivr",
    ".squixa.net": "section.io",
}


def get_domain_from_url(url):
    """Extract domain from URL.

    Only the host part of the network location is returned: a leading
    "user:password@" and a trailing ":port" are removed. A bracketed IPv6
    literal is returned including its brackets, eg. "[2001:db8::1]".

    Args:
        url: URL to parse.

    Returns:
        String with domain from URL, or an empty string if the URL has no
        network location (eg. it lacks the "//" that introduces one).
    """
    # TODO: See if we can optimize this because it is rather slow.
    netloc = urlparse.urlparse(url).netloc

    # Credentials may contain ":" themselves, so they have to be dropped
    # before the port is split off; otherwise the user name is returned.
    _, _, host_port = netloc.rpartition("@")

    if host_port.startswith("["):
        # An IPv6 literal has ":" inside the brackets; cut after the "]".
        closing = host_port.find("]")
        if closing != -1:
            return host_port[: closing + 1]

    domain, _, _ = host_port.partition(":")
    return domain


def get_tld_from_domain(domain):
    """Return the last two dot-separated labels of a domain string.

    Args:
        domain: string with a full domain, eg. www.google.com

    Returns:
        string: the last two labels of the domain joined with a dot,
        eg: google.com. A domain with fewer than two labels is returned
        unchanged.
    """
    # Splitting from the right at most twice is enough to isolate the two
    # trailing labels.
    labels = domain.rsplit(".", 2)
    return ".".join(labels[-2:])


def strip_www_from_domain(domain):
    """Remove a single leading "www." from a domain name.

    Args:
        domain: string with a full domain, eg. www.google.com

    Returns:
        string: The domain without its leading "www.", eg: google.com.
        Domains that do not start with "www." are returned unchanged.
    """
    prefix = "www."
    if not domain.startswith(prefix):
        return domain
    return domain[len(prefix) :]


def get_cdn_provider(domain):
    """Return name of CDN provider if domain is recognized as a CDN.

    The domain is compared case-insensitively against every suffix in
    KNOWN_CDN_DOMAINS.

    Args:
        domain: Domain name to check against CDN list.

    Returns:
        String of names of CDN providers or empty string if not found. If
        more than one provider matches, the unique names are sorted and
        separated by a single space.
    """
    if not domain:
        return ""

    # Host names are case-insensitive, so normalize the domain the same way
    # the suffixes are normalized before comparing.
    domain = domain.lower()
    cdn_providers = {
        provider
        for suffix, provider in KNOWN_CDN_DOMAINS.items()
        if domain.endswith(suffix.lower())
    }
    # Sorted so the result does not depend on set iteration order.
    return " ".join(sorted(cdn_providers))


def _fix_np_nan(source_dict, attribute, replace_with=None):
    """Replaces a numpy.nan value within dict with another value.

    Args:
        source_dict: a dictionary.
        attribute: string that is a key to the dictionary, in which
            numpy.nan values will be replaced by the supplied value.
        replace_with: the value that will be used as a replacement for
            the numpy.nan. If not supplied an empty list will be used.
    """
    if replace_with is None:
        replace_with = []

    value = source_dict.get(attribute)
    try:
        if numpy.isnan(value):
            source_dict[attribute] = replace_with
    except TypeError:
        # The value does not need to be changed.
        pass


def get_events_from_data_frame(frame, datastore):
    """Yields an event object for every usable row of a data frame.

    Rows whose "_id" or "_index" value is missing or falsy are skipped.

    Args:
        frame: a pandas DataFrame object.
        datastore: OpenSearch datastore client.

    Yields:
        An event (interface.Event) object for each row
        in the DataFrame.
    """
    # Attributes whose NaN value is replaced, paired with the replacement;
    # None selects the default replacement of _fix_np_nan.
    nan_replacements = (
        ("__ts_emojis", None),
        ("human_readable", ""),
        ("tag", None),
    )

    for _, entry in frame.iterrows():
        event_id = entry.get("_id")
        event_index = entry.get("_index")
        if not (event_id and event_index):
            continue

        source = entry.to_dict()
        for attribute, replacement in nan_replacements:
            _fix_np_nan(source, attribute, replace_with=replacement)

        # Values exposing to_pydatetime (presumably pandas timestamps) are
        # stored as ISO 8601 strings instead.
        timestamp = source.get("datetime")
        if hasattr(timestamp, "to_pydatetime"):
            source["datetime"] = timestamp.to_pydatetime().isoformat()

        yield interface.Event(
            {"_id": event_id, "_index": event_index, "_source": source},
            datastore,
        )


def compile_regular_expression(
    expression_string, expression_flags=None, expression_parameters=None
):
    """Returns a compiled regular expression from parameters.

    Args:
        expression_string (str): The regular expression string to compile. It
            can contain parameters that will be replaced before compilation.
        expression_flags (list): Optional list of flags for the regular
            expression, given as names of flag constants of the re module,
            eg. "IGNORECASE". Unknown names are logged and ignored.
        expression_parameters (dict): Optional dict with the keys as the
            parameter names and values as the values that the keys in the
            expression string should replace. The replacement is done with
            str.format, so it is only applied when parameters are supplied.

    Returns:
        A compiled regular expression (instance of re.Pattern), or None if
        the parameters could not be applied or the expression failed to
        compile. Both failures are logged.
    """
    re_flag = 0
    for flag in expression_flags or []:
        try:
            flag_value = getattr(re, flag)
        except (AttributeError, TypeError):
            # TypeError is raised when the flag is not a string at all.
            flag_value = None

        # The re module also exposes functions and classes; only its integer
        # flag constants can be combined into a flag value.
        if not isinstance(flag_value, int):
            logger.warning("Unknown regular expression flag defined -> %s.", flag)
            continue

        re_flag |= flag_value

    if expression_parameters:
        try:
            expression_string = expression_string.format(**expression_parameters)
        except (IndexError, KeyError, ValueError):
            # Raised by str.format for unknown placeholders or for braces
            # that are regular expression syntax, eg. "a{2}".
            logger.error(
                "Regular expression [%s] could not be expanded with the "
                "supplied parameters",
                expression_string,
                exc_info=True,
            )
            return None

    try:
        return re.compile(expression_string, flags=re_flag)
    except re.error:
        logger.error(
            "Regular expression [%s] failed to compile",
            expression_string,
            exc_info=True,
        )
        return None
